#installing the necessary packages if not pre-installed
# !pip install pandas
# !pip install plotly
# !pip install seaborn
# !pip install matplotlib
# !pip install numpy
# !pip install nbformat
# !pip install chart_studio
# !pip install matplotlib-colorbar
# !pip install category_encoders
#importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as patches
import seaborn as sns
import warnings
# Silence all library warnings (pandas/seaborn deprecation notices, etc.) to
# keep the notebook output readable; re-enable while debugging.
warnings.filterwarnings("ignore")
Read the csv file¶
#importing the e-commerce dataset into a dataframe
# NOTE(review): the CSV is expected next to the notebook; per the output below
# it loads 5,630 rows x 20 columns.
ecd = pd.read_csv('E-Commerce Churn Data.csv')
ecd
| CustomerID | Churn | Tenure | PreferredLoginDevice | CityTier | WarehouseToHome | PreferredPaymentMode | Gender | HourSpendOnApp | NumberOfDeviceRegistered | PreferedOrderCat | SatisfactionScore | MaritalStatus | NumberOfAddress | Complain | OrderAmountHikeFromlastYear | CouponUsed | OrderCount | DaySinceLastOrder | CashbackAmount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50001 | 1 | 4.0 | Mobile Phone | 3 | 6.0 | Debit Card | Female | 3.0 | 3 | Laptop & Accessory | 2 | Single | 9 | 1 | 11.0 | 1.0 | 1.0 | 5.0 | 160 |
| 1 | 50002 | 1 | NaN | Phone | 1 | 8.0 | UPI | Male | 3.0 | 4 | Mobile | 3 | Single | 7 | 1 | 15.0 | 0.0 | 1.0 | 0.0 | 121 |
| 2 | 50003 | 1 | NaN | Phone | 1 | 30.0 | Debit Card | Male | 2.0 | 4 | Mobile | 3 | Single | 6 | 1 | 14.0 | 0.0 | 1.0 | 3.0 | 120 |
| 3 | 50004 | 1 | 0.0 | Phone | 3 | 15.0 | Debit Card | Male | 2.0 | 4 | Laptop & Accessory | 5 | Single | 8 | 0 | 23.0 | 0.0 | 1.0 | 3.0 | 134 |
| 4 | 50005 | 1 | 0.0 | Phone | 1 | 12.0 | CC | Male | NaN | 3 | Mobile | 5 | Single | 3 | 0 | 11.0 | 1.0 | 1.0 | 3.0 | 130 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5625 | 55626 | 0 | 10.0 | Computer | 1 | 30.0 | Credit Card | Male | 3.0 | 2 | Laptop & Accessory | 1 | Married | 6 | 0 | 18.0 | 1.0 | 2.0 | 4.0 | 151 |
| 5626 | 55627 | 0 | 13.0 | Mobile Phone | 1 | 13.0 | Credit Card | Male | 3.0 | 5 | Fashion | 5 | Married | 6 | 0 | 16.0 | 1.0 | 2.0 | NaN | 225 |
| 5627 | 55628 | 0 | 1.0 | Mobile Phone | 1 | 11.0 | Debit Card | Male | 3.0 | 2 | Laptop & Accessory | 4 | Married | 3 | 1 | 21.0 | 1.0 | 2.0 | 4.0 | 186 |
| 5628 | 55629 | 0 | 23.0 | Computer | 3 | 9.0 | Credit Card | Male | 4.0 | 5 | Laptop & Accessory | 4 | Married | 4 | 0 | 15.0 | 2.0 | 2.0 | 9.0 | 179 |
| 5629 | 55630 | 0 | 8.0 | Mobile Phone | 1 | 15.0 | Credit Card | Male | 3.0 | 2 | Laptop & Accessory | 3 | Married | 4 | 0 | 13.0 | 2.0 | 2.0 | 3.0 | 169 |
5630 rows × 20 columns
#checking the column names
ecd.columns
Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
'MaritalStatus', 'NumberOfAddress', 'Complain',
'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
'DaySinceLastOrder', 'CashbackAmount'],
dtype='object')
Data Dictionary¶
| Variable | Description |
|---|---|
| CustomerID | Unique customer ID |
| Churn | Churn Flag (Yes = 1; No = 0) |
| Tenure | Tenure of customer in organization (i.e. length of time using the service) |
| PreferredLoginDevice | Preferred login device of customer |
| CityTier | City tier |
| WarehouseToHome | Distance in between warehouse to home of customer |
| PreferredPaymentMode | Preferred payment method of customer |
| Gender | Gender of customer |
| HourSpendOnApp | Number of hours spent on mobile application or website |
| NumberOfDeviceRegistered | Total number of devices registered to particular customer |
| PreferedOrderCat | Preferred order category of customer in last month |
| SatisfactionScore | Level of customer satisfaction with service |
| MaritalStatus | Marital status of customer |
| NumberOfAddress | Total number of addresses added on particular customer |
| Complain | Any complaint has been raised in last month |
| OrderAmountHikeFromlastYear | Percentage increase in order from last year |
| CouponUsed | Total number of coupons used in last month |
| OrderCount | Total number of orders placed in last month |
| DaySinceLastOrder | Days since last order by customer |
| CashbackAmount | Average cashback in last month |
Exploring the dataset and Cleaning data¶
#checking the description of dataset
pd.options.display.float_format = '{:20,.2f}'.format
ecd.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CustomerID | 5,630.00 | 52,815.50 | 1,625.39 | 50,001.00 | 51,408.25 | 52,815.50 | 54,222.75 | 55,630.00 |
| Churn | 5,630.00 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Tenure | 5,366.00 | 10.19 | 8.56 | 0.00 | 2.00 | 9.00 | 16.00 | 61.00 |
| CityTier | 5,630.00 | 1.65 | 0.92 | 1.00 | 1.00 | 1.00 | 3.00 | 3.00 |
| WarehouseToHome | 5,379.00 | 15.64 | 8.53 | 5.00 | 9.00 | 14.00 | 20.00 | 127.00 |
| HourSpendOnApp | 5,375.00 | 2.93 | 0.72 | 0.00 | 2.00 | 3.00 | 3.00 | 5.00 |
| NumberOfDeviceRegistered | 5,630.00 | 3.69 | 1.02 | 1.00 | 3.00 | 4.00 | 4.00 | 6.00 |
| SatisfactionScore | 5,630.00 | 3.07 | 1.38 | 1.00 | 2.00 | 3.00 | 4.00 | 5.00 |
| NumberOfAddress | 5,630.00 | 4.21 | 2.58 | 1.00 | 2.00 | 3.00 | 6.00 | 22.00 |
| Complain | 5,630.00 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| OrderAmountHikeFromlastYear | 5,365.00 | 15.71 | 3.68 | 11.00 | 13.00 | 15.00 | 18.00 | 26.00 |
| CouponUsed | 5,374.00 | 1.75 | 1.89 | 0.00 | 1.00 | 1.00 | 2.00 | 16.00 |
| OrderCount | 5,372.00 | 3.01 | 2.94 | 1.00 | 1.00 | 2.00 | 3.00 | 16.00 |
| DaySinceLastOrder | 5,323.00 | 4.54 | 3.65 | 0.00 | 2.00 | 3.00 | 7.00 | 46.00 |
| CashbackAmount | 5,630.00 | 177.22 | 49.19 | 0.00 | 146.00 | 163.00 | 196.00 | 325.00 |
#checking the first 5 records of the dataset
#(head() defaults to 5 rows — the original comment said 10, which head() does not show)
ecd.head()
| CustomerID | Churn | Tenure | PreferredLoginDevice | CityTier | WarehouseToHome | PreferredPaymentMode | Gender | HourSpendOnApp | NumberOfDeviceRegistered | PreferedOrderCat | SatisfactionScore | MaritalStatus | NumberOfAddress | Complain | OrderAmountHikeFromlastYear | CouponUsed | OrderCount | DaySinceLastOrder | CashbackAmount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50001 | 1 | 4.00 | Mobile Phone | 3 | 6.00 | Debit Card | Female | 3.00 | 3 | Laptop & Accessory | 2 | Single | 9 | 1 | 11.00 | 1.00 | 1.00 | 5.00 | 160 |
| 1 | 50002 | 1 | NaN | Phone | 1 | 8.00 | UPI | Male | 3.00 | 4 | Mobile | 3 | Single | 7 | 1 | 15.00 | 0.00 | 1.00 | 0.00 | 121 |
| 2 | 50003 | 1 | NaN | Phone | 1 | 30.00 | Debit Card | Male | 2.00 | 4 | Mobile | 3 | Single | 6 | 1 | 14.00 | 0.00 | 1.00 | 3.00 | 120 |
| 3 | 50004 | 1 | 0.00 | Phone | 3 | 15.00 | Debit Card | Male | 2.00 | 4 | Laptop & Accessory | 5 | Single | 8 | 0 | 23.00 | 0.00 | 1.00 | 3.00 | 134 |
| 4 | 50005 | 1 | 0.00 | Phone | 1 | 12.00 | CC | Male | NaN | 3 | Mobile | 5 | Single | 3 | 0 | 11.00 | 1.00 | 1.00 | 3.00 | 130 |
ecd.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5630 entries, 0 to 5629 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 5630 non-null int64 1 Churn 5630 non-null int64 2 Tenure 5366 non-null float64 3 PreferredLoginDevice 5630 non-null object 4 CityTier 5630 non-null int64 5 WarehouseToHome 5379 non-null float64 6 PreferredPaymentMode 5630 non-null object 7 Gender 5630 non-null object 8 HourSpendOnApp 5375 non-null float64 9 NumberOfDeviceRegistered 5630 non-null int64 10 PreferedOrderCat 5630 non-null object 11 SatisfactionScore 5630 non-null int64 12 MaritalStatus 5630 non-null object 13 NumberOfAddress 5630 non-null int64 14 Complain 5630 non-null int64 15 OrderAmountHikeFromlastYear 5365 non-null float64 16 CouponUsed 5374 non-null float64 17 OrderCount 5372 non-null float64 18 DaySinceLastOrder 5323 non-null float64 19 CashbackAmount 5630 non-null int64 dtypes: float64(7), int64(8), object(5) memory usage: 879.8+ KB
#Detecting missing values
ecd.isnull().any()
CustomerID False Churn False Tenure True PreferredLoginDevice False CityTier False WarehouseToHome True PreferredPaymentMode False Gender False HourSpendOnApp True NumberOfDeviceRegistered False PreferedOrderCat False SatisfactionScore False MaritalStatus False NumberOfAddress False Complain False OrderAmountHikeFromlastYear True CouponUsed True OrderCount True DaySinceLastOrder True CashbackAmount False dtype: bool
#counting the missing values
ecd.isna().sum()
CustomerID 0 Churn 0 Tenure 264 PreferredLoginDevice 0 CityTier 0 WarehouseToHome 251 PreferredPaymentMode 0 Gender 0 HourSpendOnApp 255 NumberOfDeviceRegistered 0 PreferedOrderCat 0 SatisfactionScore 0 MaritalStatus 0 NumberOfAddress 0 Complain 0 OrderAmountHikeFromlastYear 265 CouponUsed 256 OrderCount 258 DaySinceLastOrder 307 CashbackAmount 0 dtype: int64
#plotting the missing values
ecd.isna().sum().plot(kind='bar', color='Red')
plt.show()
#total number of missing values in our dataset
ecd.isna().sum().sum()
1856
#creating a function to display the data type, percentage of missing values and number of unique values per column
def sniff_modified(df):
    """Summarize each column of *df*.

    Returns a DataFrame indexed by column name with the dtype, the percentage
    of missing values, the number of unique values (NaN counted as a value),
    and the array of unique values, sorted by dtype.
    """
    # The original wrapped this in pd.option_context("display.max_colwidth", 20),
    # which had no effect: nothing is rendered inside the context.
    info = pd.DataFrame(index=df.columns)
    info['data type'] = df.dtypes
    # isnull().mean() is the fraction of missing rows; scale to a percentage
    info['percent missing'] = df.isnull().mean() * 100
    # dropna=False so NaN counts as a distinct value, matching len(x.unique())
    info['No. unique'] = df.nunique(dropna=False)
    info['unique values'] = df.apply(lambda col: col.unique())
    return info.sort_values('data type')
sniff_modified(ecd)
| data type | percent missing | No. unique | unique values | |
|---|---|---|---|---|
| CustomerID | int64 | 0.00 | 5630 | [50001, 50002, 50003, 50004, 50005, 50006, 500... |
| Complain | int64 | 0.00 | 2 | [1, 0] |
| NumberOfAddress | int64 | 0.00 | 15 | [9, 7, 6, 8, 3, 2, 4, 10, 1, 5, 19, 21, 11, 20... |
| SatisfactionScore | int64 | 0.00 | 5 | [2, 3, 5, 4, 1] |
| NumberOfDeviceRegistered | int64 | 0.00 | 6 | [3, 4, 5, 2, 1, 6] |
| Churn | int64 | 0.00 | 2 | [1, 0] |
| CityTier | int64 | 0.00 | 3 | [3, 1, 2] |
| CashbackAmount | int64 | 0.00 | 220 | [160, 121, 120, 134, 130, 139, 123, 127, 295, ... |
| WarehouseToHome | float64 | 4.46 | 35 | [6.0, 8.0, 30.0, 15.0, 12.0, 22.0, 11.0, 9.0, ... |
| HourSpendOnApp | float64 | 4.53 | 7 | [3.0, 2.0, nan, 1.0, 0.0, 4.0, 5.0] |
| DaySinceLastOrder | float64 | 5.45 | 23 | [5.0, 0.0, 3.0, 7.0, 2.0, 1.0, 8.0, 6.0, 4.0, ... |
| Tenure | float64 | 4.69 | 37 | [4.0, nan, 0.0, 13.0, 11.0, 9.0, 19.0, 20.0, 1... |
| OrderAmountHikeFromlastYear | float64 | 4.71 | 17 | [11.0, 15.0, 14.0, 23.0, 22.0, 16.0, 12.0, nan... |
| CouponUsed | float64 | 4.55 | 18 | [1.0, 0.0, 4.0, 2.0, 9.0, 6.0, 11.0, nan, 7.0,... |
| OrderCount | float64 | 4.58 | 17 | [1.0, 6.0, 2.0, 15.0, 4.0, 7.0, 3.0, 9.0, nan,... |
| PreferredPaymentMode | object | 0.00 | 7 | [Debit Card, UPI, CC, Cash on Delivery, E wall... |
| Gender | object | 0.00 | 2 | [Female, Male] |
| PreferedOrderCat | object | 0.00 | 6 | [Laptop & Accessory, Mobile, Mobile Phone, Oth... |
| PreferredLoginDevice | object | 0.00 | 3 | [Mobile Phone, Phone, Computer] |
| MaritalStatus | object | 0.00 | 3 | [Single, Divorced, Married] |
#creating a function to find the columns with missing values, extract the number and percentage of these missing values in relation to the dataset
def FindMissingColsPercentage(df):
    """Print each column's missing-value count and percentage of the column.

    Prints "no missing values" when the frame is fully populated.
    """
    total = 0
    for col in df.columns:
        # compute the count once and reuse it (the original recomputed
        # isnull().sum() again inside the print)
        missing_vals = df[col].isnull().sum()
        if missing_vals != 0:
            # isnull().mean() is the fraction of missing rows in the column
            pct = df[col].isnull().mean() * 100
            print('{} => {} [{}%]'.format(col, missing_vals, round(pct, 2)))
            total += missing_vals
    if total == 0:
        print("no missing values")
FindMissingColsPercentage(ecd)
Tenure => 264 [4.69%] WarehouseToHome => 251 [4.46%] HourSpendOnApp => 255 [4.53%] OrderAmountHikeFromlastYear => 265 [4.71%] CouponUsed => 256 [4.55%] OrderCount => 258 [4.58%] DaySinceLastOrder => 307 [5.45%]
#replacing all empty spaces with np.nan
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0
ecd_clean = ecd.replace(" ", np.nan)
# replacing all missing values (NaN) in the dataset with 0
# NOTE(review): imputing 0 into numeric columns such as Tenure or
# DaySinceLastOrder shifts their distributions toward 0, and 0 is also a
# legitimately observed value in those columns — consider median/mode
# imputation instead. TODO confirm this choice with the analysis owner.
ecd_clean = ecd_clean.fillna(0)
#checking dataset information after replacing the missing values
ecd_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5630 entries, 0 to 5629 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 5630 non-null int64 1 Churn 5630 non-null int64 2 Tenure 5630 non-null float64 3 PreferredLoginDevice 5630 non-null object 4 CityTier 5630 non-null int64 5 WarehouseToHome 5630 non-null float64 6 PreferredPaymentMode 5630 non-null object 7 Gender 5630 non-null object 8 HourSpendOnApp 5630 non-null float64 9 NumberOfDeviceRegistered 5630 non-null int64 10 PreferedOrderCat 5630 non-null object 11 SatisfactionScore 5630 non-null int64 12 MaritalStatus 5630 non-null object 13 NumberOfAddress 5630 non-null int64 14 Complain 5630 non-null int64 15 OrderAmountHikeFromlastYear 5630 non-null float64 16 CouponUsed 5630 non-null float64 17 OrderCount 5630 non-null float64 18 DaySinceLastOrder 5630 non-null float64 19 CashbackAmount 5630 non-null int64 dtypes: float64(7), int64(8), object(5) memory usage: 879.8+ KB
ecd_clean.isnull().any()
CustomerID False Churn False Tenure False PreferredLoginDevice False CityTier False WarehouseToHome False PreferredPaymentMode False Gender False HourSpendOnApp False NumberOfDeviceRegistered False PreferedOrderCat False SatisfactionScore False MaritalStatus False NumberOfAddress False Complain False OrderAmountHikeFromlastYear False CouponUsed False OrderCount False DaySinceLastOrder False CashbackAmount False dtype: bool
FindMissingColsPercentage(ecd_clean)
no missing values
#viewing the object data type columns
ecd_clean[['PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'PreferredLoginDevice', 'MaritalStatus']]
| PreferredPaymentMode | Gender | PreferedOrderCat | PreferredLoginDevice | MaritalStatus | |
|---|---|---|---|---|---|
| 0 | Debit Card | Female | Laptop & Accessory | Mobile Phone | Single |
| 1 | UPI | Male | Mobile | Phone | Single |
| 2 | Debit Card | Male | Mobile | Phone | Single |
| 3 | Debit Card | Male | Laptop & Accessory | Phone | Single |
| 4 | CC | Male | Mobile | Phone | Single |
| ... | ... | ... | ... | ... | ... |
| 5625 | Credit Card | Male | Laptop & Accessory | Computer | Married |
| 5626 | Credit Card | Male | Fashion | Mobile Phone | Married |
| 5627 | Debit Card | Male | Laptop & Accessory | Mobile Phone | Married |
| 5628 | Credit Card | Male | Laptop & Accessory | Computer | Married |
| 5629 | Credit Card | Male | Laptop & Accessory | Mobile Phone | Married |
5630 rows × 5 columns
#checking the unique values in these columns
# object-dtype columns whose category levels we want to inspect
obj = ['PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'PreferredLoginDevice', 'MaritalStatus']
# print the frequency table of each categorical column
for col_name in obj:
    print(ecd_clean[col_name].value_counts(), '\n')
PreferredPaymentMode Debit Card 2314 Credit Card 1501 E wallet 614 UPI 414 COD 365 CC 273 Cash on Delivery 149 Name: count, dtype: int64 Gender Male 3384 Female 2246 Name: count, dtype: int64 PreferedOrderCat Laptop & Accessory 2050 Mobile Phone 1271 Fashion 826 Mobile 809 Grocery 410 Others 264 Name: count, dtype: int64 PreferredLoginDevice Mobile Phone 2765 Computer 1634 Phone 1231 Name: count, dtype: int64 MaritalStatus Married 2986 Single 1796 Divorced 848 Name: count, dtype: int64
#merging category spellings that mean the same thing
payment_fixes = {'CC': 'Credit Card', 'COD': 'Cash on Delivery'}
ecd_clean['PreferredPaymentMode'] = ecd_clean['PreferredPaymentMode'].replace(payment_fixes)
ecd_clean['PreferedOrderCat'] = ecd_clean['PreferedOrderCat'].replace({'Mobile': 'Mobile Phone'})
ecd_clean['PreferredLoginDevice'] = ecd_clean['PreferredLoginDevice'].replace({'Phone': 'Mobile Phone'})
#cross-checking the replacement has been effected
# re-print each categorical column's frequency table after the merge
for col_name in obj:
    print(ecd_clean[col_name].value_counts(), '\n')
PreferredPaymentMode Debit Card 2314 Credit Card 1774 E wallet 614 Cash on Delivery 514 UPI 414 Name: count, dtype: int64 Gender Male 3384 Female 2246 Name: count, dtype: int64 PreferedOrderCat Mobile Phone 2080 Laptop & Accessory 2050 Fashion 826 Grocery 410 Others 264 Name: count, dtype: int64 PreferredLoginDevice Mobile Phone 3996 Computer 1634 Name: count, dtype: int64 MaritalStatus Married 2986 Single 1796 Divorced 848 Name: count, dtype: int64
#creating categories from tenure column into a new column - tenuregroup
ranges = [0,10,20,30,40,50,60,np.inf] #list to hold the bin ranges
group_names = ['0-10 years', '11-20 years', '21-30 years', '31-40 years', '41-50 years', '51-60 years', '61 years & above'] # list to hold the labels
# Bins are right-closed: (0,10], (10,20], ... so Tenure 10 lands in '0-10 years';
# include_lowest=True also puts Tenure == 0 into the first bin.
# NOTE(review): the labels say "years" but Tenure reaches 61 — the unit may
# actually be months; confirm against the data source.
ecd_clean['TenureGroup'] = pd.cut(ecd_clean['Tenure'], bins = ranges, labels = group_names, include_lowest = True)
ecd_clean[['Tenure', 'TenureGroup']]
| Tenure | TenureGroup | |
|---|---|---|
| 0 | 4.00 | 0-10 years |
| 1 | 0.00 | 0-10 years |
| 2 | 0.00 | 0-10 years |
| 3 | 0.00 | 0-10 years |
| 4 | 0.00 | 0-10 years |
| ... | ... | ... |
| 5625 | 10.00 | 0-10 years |
| 5626 | 13.00 | 11-20 years |
| 5627 | 1.00 | 0-10 years |
| 5628 | 23.00 | 21-30 years |
| 5629 | 8.00 | 0-10 years |
5630 rows × 2 columns
#checking the final clean data
ecd_clean.head(20), ecd_clean.shape
( CustomerID Churn Tenure PreferredLoginDevice CityTier \
0 50001 1 4.00 Mobile Phone 3
1 50002 1 0.00 Mobile Phone 1
2 50003 1 0.00 Mobile Phone 1
3 50004 1 0.00 Mobile Phone 3
4 50005 1 0.00 Mobile Phone 1
5 50006 1 0.00 Computer 1
6 50007 1 0.00 Mobile Phone 3
7 50008 1 0.00 Mobile Phone 1
8 50009 1 13.00 Mobile Phone 3
9 50010 1 0.00 Mobile Phone 1
10 50011 1 4.00 Mobile Phone 1
11 50012 1 11.00 Mobile Phone 1
12 50013 1 0.00 Mobile Phone 1
13 50014 1 0.00 Mobile Phone 1
14 50015 1 9.00 Mobile Phone 3
15 50016 1 0.00 Mobile Phone 2
16 50017 1 0.00 Computer 1
17 50018 1 0.00 Mobile Phone 3
18 50019 1 0.00 Computer 1
19 50020 1 19.00 Mobile Phone 1
WarehouseToHome PreferredPaymentMode Gender HourSpendOnApp \
0 6.00 Debit Card Female 3.00
1 8.00 UPI Male 3.00
2 30.00 Debit Card Male 2.00
3 15.00 Debit Card Male 2.00
4 12.00 Credit Card Male 0.00
5 22.00 Debit Card Female 3.00
6 11.00 Cash on Delivery Male 2.00
7 6.00 Credit Card Male 3.00
8 9.00 E wallet Male 0.00
9 31.00 Debit Card Male 2.00
10 18.00 Cash on Delivery Female 2.00
11 6.00 Debit Card Male 3.00
12 11.00 Cash on Delivery Male 2.00
13 15.00 Credit Card Male 3.00
14 15.00 Credit Card Male 3.00
15 12.00 UPI Male 3.00
16 12.00 Debit Card Female 0.00
17 11.00 E wallet Male 2.00
18 13.00 Debit Card Male 3.00
19 20.00 Debit Card Female 3.00
NumberOfDeviceRegistered ... SatisfactionScore MaritalStatus \
0 3 ... 2 Single
1 4 ... 3 Single
2 4 ... 3 Single
3 4 ... 5 Single
4 3 ... 5 Single
5 5 ... 5 Single
6 3 ... 2 Divorced
7 3 ... 2 Divorced
8 4 ... 3 Divorced
9 5 ... 3 Single
10 3 ... 3 Divorced
11 4 ... 3 Single
12 3 ... 3 Single
13 4 ... 3 Divorced
14 4 ... 2 Single
15 3 ... 5 Married
16 4 ... 2 Single
17 4 ... 3 Single
18 5 ... 3 Single
19 3 ... 4 Divorced
NumberOfAddress Complain OrderAmountHikeFromlastYear \
0 9 1 11.00
1 7 1 15.00
2 6 1 14.00
3 8 0 23.00
4 3 0 11.00
5 2 1 22.00
6 4 0 14.00
7 3 1 16.00
8 2 1 14.00
9 2 0 12.00
10 2 0 0.00
11 10 1 13.00
12 2 1 13.00
13 1 1 17.00
14 2 0 16.00
15 5 1 22.00
16 2 1 18.00
17 2 1 11.00
18 2 1 24.00
19 10 1 18.00
CouponUsed OrderCount DaySinceLastOrder \
0 1.00 1.00 5.00
1 0.00 1.00 0.00
2 0.00 1.00 3.00
3 0.00 1.00 3.00
4 1.00 1.00 3.00
5 4.00 6.00 7.00
6 0.00 1.00 0.00
7 2.00 2.00 0.00
8 0.00 1.00 2.00
9 1.00 1.00 1.00
10 9.00 15.00 8.00
11 0.00 1.00 0.00
12 2.00 2.00 2.00
13 0.00 1.00 0.00
14 0.00 4.00 7.00
15 1.00 1.00 2.00
16 1.00 1.00 0.00
17 1.00 1.00 3.00
18 1.00 1.00 6.00
19 1.00 4.00 3.00
CashbackAmount TenureGroup
0 160 0-10 years
1 121 0-10 years
2 120 0-10 years
3 134 0-10 years
4 130 0-10 years
5 139 0-10 years
6 121 0-10 years
7 123 0-10 years
8 127 11-20 years
9 123 0-10 years
10 295 0-10 years
11 154 11-20 years
12 134 0-10 years
13 134 0-10 years
14 196 0-10 years
15 121 0-10 years
16 129 0-10 years
17 157 0-10 years
18 161 0-10 years
19 150 11-20 years
[20 rows x 21 columns],
(5630, 21))
Exploratory Data Analysis¶
# Plotting graphs for better understanding of data distribution
cols = ['HourSpendOnApp', 'NumberOfDeviceRegistered', 'SatisfactionScore', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'DaySinceLastOrder']
ecd[cols].hist(figsize=(15,15), bins=15, color='Red')
array([[<Axes: title={'center': 'HourSpendOnApp'}>,
<Axes: title={'center': 'NumberOfDeviceRegistered'}>],
[<Axes: title={'center': 'SatisfactionScore'}>,
<Axes: title={'center': 'OrderAmountHikeFromlastYear'}>],
[<Axes: title={'center': 'CouponUsed'}>,
<Axes: title={'center': 'DaySinceLastOrder'}>]], dtype=object)
ecd_1 = ecd_clean.copy() #copy of the cleaned dataset so plots can show text labels without mutating ecd_clean
churn_labels = {0: "Customer Retained", 1: "Customer Churned"} #mapping from the binary flag to readable labels
ecd_1['Churn'] = ecd_1['Churn'].replace(churn_labels) #replacing the binary values with the readable labels
#visualising customer churn rate
sns.countplot(x = 'Churn', data = ecd_1, palette=sns.color_palette('RdBu'), order=['Customer Retained', 'Customer Churned'])
plt.title("Churn Rate")
plt.show()
From the above plot, we can see that the large majority of the e-commerce company's customer base was retained, while a smaller portion stopped using its services.
#visualising the churn rate in percentages
n_churned = (ecd_clean['Churn'] == 1).sum()
n_retained = (ecd_clean['Churn'] == 0).sum()
# slice labels, sizes and colours for the pie chart
labels = 'Customers Churned', 'Customers Retained'
sizes = [n_churned, n_retained]
colors_given = ['#ff7f0e', '#66B3AA']
# pull the churned slice out by 15% to emphasise it
explode = (0.15, 0)
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
       wedgeprops={'edgecolor': 'black'}, colors=colors_given, explode=explode)
ax.axis('equal')  # keep the pie circular
# legend placed outside the axes, with a black frame
legend_labels = ['Customers Churned', 'Customers Retained']
legend = ax.legend(legend_labels, title="", loc="center left", bbox_to_anchor=(1.1, 1))
legend.get_frame().set_edgecolor('black')
plt.title('Customer Churn Analysis', fontweight='bold')
plt.show()
The pie chart above shows that 16.8% of the total customer base churned, while 83.2% are still using the services of the e-commerce company.
#visualising the percentage of customers that churned
ecd_clean['Churn'].value_counts().plot.pie(labels=['Customers who stayed', 'Customers who left'],
autopct='%1.1f%%',shadow=True, colors=sns.color_palette('Reds'))
plt.title("Proportion of Customers churned")
plt.show()
#checking out churn rate by gender distribution using a pivot table
ecd_clean.pivot_table(values="Tenure", index="Gender", columns="Churn",
aggfunc='sum')
| Churn | 0 | 1 |
|---|---|---|
| Gender | ||
| Female | 22,011.00 | 946.00 |
| Male | 29,738.00 | 1,984.00 |
#visualing the gender distibution with churn rate
sns.catplot(x='Churn', data=ecd_1, hue='Gender', hue_order=['Male', 'Female'],
kind='count', palette=['red', 'darkred'], # Set custom red shades
order=['Customer Retained', 'Customer Churned'])
plt.title("Churn Rate by Gender")
plt.show()
sns.catplot(x='TenureGroup', data=ecd_clean, kind='count',palette=sns.color_palette('RdBu'))
plt.xticks(rotation=90)
plt.show()
The count plot shows the distribution of customers segmented into different tenure groups. Most of the customers fall into the category of those that have been with the company for less than 10 years.
sns.catplot(x='TenureGroup', y='Churn', data=ecd_clean, kind='bar', palette=sns.color_palette('RdBu'))
plt.ylabel('Churn rate')
plt.xticks(rotation=90)
plt.show()
The above plot shows the mean churn rate of customers per tenure category, with a 95% confidence interval.
# Set context and font scale for better readability
sns.set_context("paper", font_scale=1.1)
# Create the plot using histplot
plt.figure(figsize=(10, 6))
sns.histplot(data=ecd_clean, x="DaySinceLastOrder", hue="Churn", multiple="stack", palette=["red", "green"], kde=False)
# Add titles and labels
plt.title('Analysis of Churn based on the Days Passed from Last Order Date')
plt.xlabel('Days Since Last Order')
# NOTE(review): the y-axis is a stacked customer count, not a rate
plt.ylabel('Churn Rate')
# Show the legend with custom labels
# NOTE(review): passing labels= to plt.legend over a seaborn hue plot relies on
# the handle ordering seaborn created — verify the labels line up with
# Churn 0/1 as intended before publishing.
plt.legend(title='Customer Status', labels=["Customers Retained", "Customers Churned"], loc='upper right')
# Display the plot
plt.show()
sns.set_context("paper", font_scale=1.1)
# Density of DaySinceLastOrder for retained vs churned customers.
# BUG FIX: the original drew the retained curve from the raw `ecd` frame
# (which still contains NaNs) and the churned curve from `ecd_clean`; both
# curves now use the cleaned frame so they are computed on the same data.
# NOTE(review): seaborn has deprecated `shade=` in favour of `fill=`; switch
# once the seaborn version is pinned.
ax = sns.kdeplot(ecd_clean.DaySinceLastOrder[(ecd_clean["Churn"] == 0)],
                 color="Red", shade=True)
ax = sns.kdeplot(ecd_clean.DaySinceLastOrder[(ecd_clean["Churn"] == 1)],
                 ax=ax, color="green", shade=True)
ax.legend(["Customers Retained", "Customers Churned"], loc='upper right')
ax.set_ylabel('Churn Rate')
ax.set_xlabel('DaysSinceLastOrder')
ax.set_title('Analysis of Churn based on the days passed from Last Order Date')
palette = sns.color_palette('tab10')
# Mean churn rate vs order count, one line per city tier.
# NOTE(review): ci=None (disable the bootstrap confidence band) is deprecated
# in newer seaborn in favour of errorbar=None — update when seaborn is pinned.
sns.lineplot(
    data=ecd, x='OrderCount', y="Churn",
    hue="CityTier", ci=None,
    palette=palette
)
#CityTier
<Axes: xlabel='OrderCount', ylabel='Churn'>
We see that for higher order counts — possibly due to operational issues or stock availability at warehouses — churn is high in Tier 1 and Tier 3 cities, which show the highest churn, while it is lowest in Tier 2.
# Create a custom color palette for churn categories
palette = ["#FF6347", "#FFA07A"] # Red for churned, Orange for retained
# Create the violin plot for Churn by Satisfaction Score
plt.figure(figsize=(10, 6))
sns.violinplot(data=ecd_1, x='Churn', y='SatisfactionScore', palette=palette)
# Add titles and labels
plt.title('Churn Distribution by Satisfaction Rate')
plt.xlabel('Churn')
plt.ylabel('Satisfaction Score')
# Display the plot
plt.show()
order = ['Customer Retained', 'Customer Churned']
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.countplot(data=ecd_1, x='Complain', hue='Churn', ax=axes[0], hue_order=order)
sns.countplot(data=ecd_1, x='SatisfactionScore', hue='Churn', ax=axes[1], hue_order=order)
#sns.countplot(data=ecd, x='PaymentMethod', hue='Churn', ax=axes[1, 0])
#sns.countplot(data=ecd, x='TechSupport', hue='Churn', ax=axes[1, 1])
plt.tight_layout()
About a third of the customers who complained about the company's services actually churned afterwards. Even though customers provided a high satisfaction score with the service, some of these customers still left.
import seaborn as sns
import matplotlib.pyplot as plt
# Calculate churn distribution by Preferred Order Category
churn_distribution = ecd_1.groupby(['PreferedOrderCat', 'Churn']).size().reset_index(name='Count')
# Pivot the data to make it easier to plot
churn_pivot = churn_distribution.pivot_table(index='PreferedOrderCat', columns='Churn', values='Count', aggfunc='sum').fillna(0)
# Plot a pie chart for each category in PreferedOrderCat
fig, axes = plt.subplots(1, len(churn_pivot), figsize=(15, 6))
# Define colors for Churned and Retained
colors = ['#FF6347', '#FFA07A'] # Red for churned, Orange for retained
for idx, category in enumerate(churn_pivot.index):
# Plot pie chart for each 'PreferedOrderCat'
axes[idx].pie(churn_pivot.loc[category], labels=["Churned", "Retained"], colors=colors, autopct='%1.1f%%', startangle=90)
axes[idx].set_title(f"Churn Distribution - {category}")
# Adjust the layout
plt.tight_layout()
# Display the plot
plt.show()
# Side-by-side churn breakdowns by city tier and preferred order category
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.countplot(data=ecd_1, x='CityTier', hue='Churn', ax=axes[0], hue_order=order)
# BUG FIX: the order list previously contained the typo 'Grocey'; the actual
# category is 'Grocery', so seaborn was silently dropping the Grocery bars.
sns.countplot(data=ecd_1, x='PreferedOrderCat', hue='Churn', ax=axes[1],
              order=['Laptop & Accessory', 'Mobile Phone', 'Fashion', 'Grocery', 'Others'],
              hue_order = order)
plt.tight_layout()
Customers in CityTier 2 had the lowest churn rate relative to Tiers 1 and 3. Also, in terms of customers' preferred order category, those in the Mobile Phone category had the highest churn rate, followed by Laptop & Accessory.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.countplot(data=ecd_1, x='HourSpendOnApp', hue='Churn', ax=axes[0], hue_order=order)
sns.countplot(data=ecd_1, x='NumberOfDeviceRegistered', hue='Churn', ax=axes[1], hue_order=order)
plt.tight_layout()
Customers who spent an average of 3 hours on the app and had an average of 4 registered devices had the highest churn rate.
#subsetting the dataset to determine the churn characteristics of customers that registered a complaint with the company
# groupby(...).size() counts rows per (Complain, Churn) pair directly; the
# previous groupby(...).apply(lambda x: x['Churn'].count()) produced the same
# counts but via a slower Python-level pass (and triggers a deprecation
# warning about apply operating on the grouping columns in newer pandas).
count_of_complaints = ecd_clean.groupby(['Complain', 'Churn']).size().reset_index(name='No. of Customers')
count_of_complaints
| Complain | Churn | No. of Customers | |
|---|---|---|---|
| 0 | 0 | 0 | 3586 |
| 1 | 0 | 1 | 440 |
| 2 | 1 | 0 | 1096 |
| 3 | 1 | 1 | 508 |
# Grouped bar chart of churned vs retained customers, split by complaint flag.
# NOTE(review): the counts below are copied by hand from the
# count_of_complaints table above — recompute them from that table if the
# cleaning steps ever change.
complain_values = [0, 1]
churn_0 = [3586, 1096] # Customer Retained
churn_1 = [440, 508] # Customer Churned
complain_labels = ['No Complaint', 'Complaint']
bar_width = 0.30
# x positions: r1 for the retained bars, r2 shifted right for the churned bars
r1 = range(len(complain_values))
r2 = [x + bar_width for x in r1]
fig, ax = plt.subplots(figsize=(10, 6))
plt.bar(r1, churn_0, color='#66B3FF', width=bar_width, edgecolor='grey', label='Customers Stayed')
plt.bar(r2, churn_1, color='#FF4433', width=bar_width, edgecolor='grey', label='Customers Left')
# annotate each bar with its count, nudged above the bar top
for i in range(len(complain_values)):
    plt.text(r1[i], churn_0[i] + 20, str(churn_0[i]), ha='center', va='bottom', color='black', fontweight='bold')
    plt.text(r2[i], churn_1[i] + 50, str(churn_1[i]), ha='center', va='bottom', color='black', fontweight='bold')
plt.xlabel('Complain', fontweight='bold')
# center the tick labels between each pair of bars
plt.xticks([r + bar_width/2 for r in range(len(complain_values))], complain_labels)
plt.ylabel('No. of Customers', fontweight='bold')
plt.title('Complaint counts of Customers in a Company', fontweight='bold')
plt.legend()
plt.show()
Model Training and Prediction¶
# Import all the libraries for machine learning models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score,recall_score,f1_score
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
ecd_clean.nunique(), ecd_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5630 entries, 0 to 5629 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 5630 non-null int64 1 Churn 5630 non-null int64 2 Tenure 5630 non-null float64 3 PreferredLoginDevice 5630 non-null object 4 CityTier 5630 non-null int64 5 WarehouseToHome 5630 non-null float64 6 PreferredPaymentMode 5630 non-null object 7 Gender 5630 non-null object 8 HourSpendOnApp 5630 non-null float64 9 NumberOfDeviceRegistered 5630 non-null int64 10 PreferedOrderCat 5630 non-null object 11 SatisfactionScore 5630 non-null int64 12 MaritalStatus 5630 non-null object 13 NumberOfAddress 5630 non-null int64 14 Complain 5630 non-null int64 15 OrderAmountHikeFromlastYear 5630 non-null float64 16 CouponUsed 5630 non-null float64 17 OrderCount 5630 non-null float64 18 DaySinceLastOrder 5630 non-null float64 19 CashbackAmount 5630 non-null int64 20 TenureGroup 5630 non-null category dtypes: category(1), float64(7), int64(8), object(5) memory usage: 885.7+ KB
(CustomerID 5630 Churn 2 Tenure 36 PreferredLoginDevice 2 CityTier 3 WarehouseToHome 35 PreferredPaymentMode 5 Gender 2 HourSpendOnApp 6 NumberOfDeviceRegistered 6 PreferedOrderCat 5 SatisfactionScore 5 MaritalStatus 3 NumberOfAddress 15 Complain 2 OrderAmountHikeFromlastYear 17 CouponUsed 17 OrderCount 17 DaySinceLastOrder 22 CashbackAmount 220 TenureGroup 7 dtype: int64, None)
# Convert the ordered TenureGroup category into ordinal integer codes 1-7
tenure_group_codes = {
    '0-10 years': 1,
    '11-20 years': 2,
    '21-30 years': 3,
    '31-40 years': 4,
    '41-50 years': 5,
    '51-60 years': 6,
    '61 years & above': 7,
}
encoder = ce.OrdinalEncoder(
    mapping=[{'col': 'TenureGroup', 'mapping': tenure_group_codes}]
)
# fit_transform combines the fit + transform steps of the original cell
ecd_clean = encoder.fit_transform(ecd_clean)
ecd_clean.head()
| CustomerID | Churn | Tenure | PreferredLoginDevice | CityTier | WarehouseToHome | PreferredPaymentMode | Gender | HourSpendOnApp | NumberOfDeviceRegistered | ... | SatisfactionScore | MaritalStatus | NumberOfAddress | Complain | OrderAmountHikeFromlastYear | CouponUsed | OrderCount | DaySinceLastOrder | CashbackAmount | TenureGroup | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50001 | 1 | 4.00 | Mobile Phone | 3 | 6.00 | Debit Card | Female | 3.00 | 3 | ... | 2 | Single | 9 | 1 | 11.00 | 1.00 | 1.00 | 5.00 | 160 | 1 |
| 1 | 50002 | 1 | 0.00 | Mobile Phone | 1 | 8.00 | UPI | Male | 3.00 | 4 | ... | 3 | Single | 7 | 1 | 15.00 | 0.00 | 1.00 | 0.00 | 121 | 1 |
| 2 | 50003 | 1 | 0.00 | Mobile Phone | 1 | 30.00 | Debit Card | Male | 2.00 | 4 | ... | 3 | Single | 6 | 1 | 14.00 | 0.00 | 1.00 | 3.00 | 120 | 1 |
| 3 | 50004 | 1 | 0.00 | Mobile Phone | 3 | 15.00 | Debit Card | Male | 2.00 | 4 | ... | 5 | Single | 8 | 0 | 23.00 | 0.00 | 1.00 | 3.00 | 134 | 1 |
| 4 | 50005 | 1 | 0.00 | Mobile Phone | 1 | 12.00 | Credit Card | Male | 0.00 | 3 | ... | 5 | Single | 3 | 0 | 11.00 | 1.00 | 1.00 | 3.00 | 130 | 1 |
5 rows × 21 columns
# Inspect the encoded TenureGroup values (now ordinal integers 1-7)
ecd_clean['TenureGroup'].unique()
# StandardScaler is the only preprocessing name not already imported above
# (the redundant LabelEncoder re-import was removed; it is imported earlier)
from sklearn.preprocessing import StandardScaler
#defining columns (target column and id, needed for encoding later)
cust_id = ['CustomerID']
Target = ["Churn"]
#categorical columns: fewer than 7 unique values, excluding the target
cat_variable = ecd_clean.nunique()[ecd_clean.nunique() < 7].keys().tolist()
cat_variable = [x for x in cat_variable if x not in Target]
#numerical columns: everything that is not categorical, target, or id
numerical_vari = [x for x in ecd_clean if x not in cat_variable + Target + cust_id]
#binary columns: exactly 2 unique values
# NOTE(review): this list also contains the target Churn; label-encoding a
# 0/1 integer column is a no-op, so behaviour is unaffected
binary_vari = ecd_clean.nunique()[ecd_clean.nunique() == 2].keys().tolist()
#remaining categoricals (3-6 unique values) get one-hot encoding
more_then_2_vari = [i for i in cat_variable if i not in binary_vari]
#label encoding binary columns
lab_encod = LabelEncoder()
for x in binary_vari:
    ecd_clean[x] = lab_encod.fit_transform(ecd_clean[x])
ecd_clean = pd.get_dummies(ecd_clean, columns=more_then_2_vari)
#standardizing (zero mean / unit variance) numerical columns
# NOTE(review): only this hard-coded subset of numerical_vari is scaled;
# confirm the remaining numeric columns are intentionally left on their
# original scale (the describe() output below shows they are unscaled)
scaler = StandardScaler()
numerical_cols = ["Tenure", "WarehouseToHome", "NumberOfAddress", "CashbackAmount"]
ecd_clean[numerical_cols] = scaler.fit_transform(ecd_clean[numerical_cols])
#summary statistics after encoding and scaling
ecd_clean.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CustomerID | 5,630.00 | 52,815.50 | 1,625.39 | 50,001.00 | 51,408.25 | 52,815.50 | 54,222.75 | 55,630.00 |
| Churn | 5,630.00 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Tenure | 5,630.00 | 0.00 | 1.00 | -1.13 | -1.01 | -0.20 | 0.61 | 5.95 |
| PreferredLoginDevice | 5,630.00 | 0.71 | 0.45 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 |
| WarehouseToHome | 5,630.00 | 0.00 | 1.00 | -1.67 | -0.66 | -0.22 | 0.57 | 12.53 |
| Gender | 5,630.00 | 0.60 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 |
| NumberOfAddress | 5,630.00 | 0.00 | 1.00 | -1.24 | -0.86 | -0.47 | 0.69 | 6.88 |
| Complain | 5,630.00 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| OrderAmountHikeFromlastYear | 5,630.00 | 14.97 | 4.89 | 0.00 | 13.00 | 14.00 | 18.00 | 26.00 |
| CouponUsed | 5,630.00 | 1.67 | 1.89 | 0.00 | 1.00 | 1.00 | 2.00 | 16.00 |
| OrderCount | 5,630.00 | 2.87 | 2.94 | 0.00 | 1.00 | 2.00 | 3.00 | 16.00 |
| DaySinceLastOrder | 5,630.00 | 4.30 | 3.70 | 0.00 | 1.00 | 3.00 | 7.00 | 46.00 |
| CashbackAmount | 5,630.00 | 0.00 | 1.00 | -3.60 | -0.63 | -0.29 | 0.38 | 3.00 |
| TenureGroup | 5,630.00 | 1.55 | 0.75 | 1.00 | 1.00 | 1.00 | 2.00 | 7.00 |
#computing pairwise Pearson correlations between all (now numeric) columns
corr_ecd = ecd_clean.corr()
#keep the axis labels and the raw value grid for the heatmap plotted next
matrix_cols = [col for col in corr_ecd.columns]
corr_array = np.array(corr_ecd.values)
corr_ecd
| CustomerID | Churn | Tenure | PreferredLoginDevice | WarehouseToHome | Gender | NumberOfAddress | Complain | OrderAmountHikeFromlastYear | CouponUsed | ... | PreferedOrderCat_Mobile Phone | PreferedOrderCat_Others | SatisfactionScore_1 | SatisfactionScore_2 | SatisfactionScore_3 | SatisfactionScore_4 | SatisfactionScore_5 | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CustomerID | 1.00 | -0.02 | 0.08 | -0.00 | 0.12 | 0.00 | 0.16 | -0.01 | 0.03 | 0.24 | ... | -0.00 | -0.01 | 0.14 | -0.20 | -0.00 | 0.01 | 0.01 | -0.24 | 0.18 | -0.01 |
| Churn | -0.02 | 1.00 | -0.35 | -0.05 | 0.03 | 0.03 | 0.04 | 0.25 | 0.04 | 0.01 | ... | 0.22 | -0.05 | -0.07 | -0.04 | 0.01 | 0.00 | 0.09 | -0.02 | -0.15 | 0.18 |
| Tenure | 0.08 | -0.35 | 1.00 | 0.04 | 0.02 | -0.05 | 0.25 | -0.02 | -0.18 | 0.10 | ... | -0.37 | 0.27 | 0.01 | -0.00 | 0.00 | 0.01 | -0.03 | 0.04 | 0.08 | -0.12 |
| PreferredLoginDevice | -0.00 | -0.05 | 0.04 | 1.00 | -0.02 | -0.02 | 0.03 | 0.00 | -0.04 | -0.02 | ... | 0.01 | 0.05 | -0.02 | 0.01 | -0.04 | 0.01 | 0.04 | 0.03 | -0.00 | -0.02 |
| WarehouseToHome | 0.12 | 0.03 | 0.02 | -0.02 | 1.00 | -0.00 | 0.01 | 0.02 | 0.04 | 0.03 | ... | -0.14 | -0.05 | -0.00 | -0.02 | 0.01 | 0.00 | 0.00 | -0.00 | 0.04 | -0.04 |
| Gender | 0.00 | 0.03 | -0.05 | -0.02 | -0.00 | 1.00 | -0.03 | -0.04 | -0.02 | -0.03 | ... | 0.05 | 0.03 | 0.03 | 0.01 | -0.01 | -0.03 | -0.01 | -0.01 | 0.04 | -0.03 |
| NumberOfAddress | 0.16 | 0.04 | 0.25 | 0.03 | 0.01 | -0.03 | 1.00 | -0.03 | -0.04 | 0.03 | ... | -0.14 | 0.05 | -0.04 | -0.01 | -0.01 | 0.03 | 0.03 | -0.00 | 0.01 | -0.01 |
| Complain | -0.01 | 0.25 | -0.02 | 0.00 | 0.02 | -0.04 | -0.03 | 1.00 | 0.01 | -0.01 | ... | 0.01 | -0.01 | 0.04 | 0.00 | -0.01 | -0.04 | 0.01 | 0.01 | -0.00 | -0.00 |
| OrderAmountHikeFromlastYear | 0.03 | 0.04 | -0.18 | -0.04 | 0.04 | -0.02 | -0.04 | 0.01 | 1.00 | -0.09 | ... | 0.13 | -0.49 | 0.03 | 0.01 | -0.03 | -0.01 | 0.01 | -0.03 | -0.01 | 0.04 |
| CouponUsed | 0.24 | 0.01 | 0.10 | -0.02 | 0.03 | -0.03 | 0.03 | -0.01 | -0.09 | 1.00 | ... | -0.12 | 0.05 | -0.01 | 0.01 | -0.02 | 0.01 | 0.02 | 0.03 | 0.01 | -0.03 |
| OrderCount | 0.13 | -0.02 | 0.16 | -0.01 | 0.02 | -0.02 | -0.03 | -0.03 | -0.13 | 0.59 | ... | -0.18 | 0.18 | -0.00 | -0.00 | -0.01 | -0.00 | 0.02 | 0.02 | 0.03 | -0.05 |
| DaySinceLastOrder | 0.08 | -0.15 | 0.18 | 0.00 | 0.05 | -0.02 | -0.07 | -0.04 | -0.12 | 0.25 | ... | -0.26 | 0.18 | -0.01 | -0.01 | -0.02 | -0.00 | 0.04 | 0.01 | 0.04 | -0.05 |
| CashbackAmount | 0.22 | -0.15 | 0.51 | 0.05 | 0.07 | -0.03 | 0.19 | 0.00 | -0.38 | 0.17 | ... | -0.58 | 0.57 | -0.00 | 0.00 | -0.00 | 0.00 | 0.00 | 0.04 | 0.05 | -0.09 |
| TenureGroup | 0.05 | -0.25 | 0.93 | 0.04 | 0.02 | -0.04 | 0.23 | 0.00 | -0.17 | 0.06 | ... | -0.30 | 0.27 | -0.00 | -0.01 | 0.02 | 0.01 | -0.02 | 0.04 | 0.06 | -0.10 |
| CityTier_1 | -0.01 | -0.08 | 0.04 | -0.00 | -0.03 | 0.02 | 0.03 | -0.00 | 0.00 | -0.02 | ... | 0.23 | 0.03 | -0.02 | 0.01 | 0.04 | -0.04 | 0.01 | -0.01 | 0.02 | -0.01 |
| CityTier_2 | 0.02 | 0.02 | 0.01 | 0.01 | -0.01 | 0.04 | 0.01 | -0.01 | -0.05 | -0.03 | ... | 0.07 | 0.06 | 0.00 | -0.04 | -0.03 | 0.07 | -0.01 | -0.01 | 0.05 | -0.05 |
| CityTier_3 | -0.00 | 0.08 | -0.05 | -0.00 | 0.04 | -0.03 | -0.03 | 0.01 | 0.02 | 0.03 | ... | -0.27 | -0.06 | 0.02 | 0.01 | -0.03 | 0.01 | -0.01 | 0.02 | -0.04 | 0.03 |
| PreferredPaymentMode_Cash on Delivery | -0.00 | 0.07 | -0.02 | 0.02 | -0.02 | 0.01 | -0.01 | -0.02 | -0.01 | 0.02 | ... | 0.07 | -0.04 | -0.03 | -0.00 | 0.01 | 0.02 | -0.00 | 0.00 | -0.02 | 0.02 |
| PreferredPaymentMode_Credit Card | -0.00 | -0.05 | 0.00 | -0.00 | 0.01 | -0.02 | 0.02 | 0.00 | 0.00 | -0.02 | ... | 0.05 | 0.02 | -0.02 | -0.03 | 0.03 | -0.01 | 0.01 | -0.00 | 0.01 | -0.01 |
| PreferredPaymentMode_Debit Card | 0.00 | -0.03 | 0.02 | -0.03 | -0.03 | 0.00 | -0.00 | -0.01 | 0.02 | -0.00 | ... | 0.01 | -0.01 | 0.03 | 0.03 | 0.02 | -0.05 | -0.02 | -0.01 | -0.02 | 0.02 |
| PreferredPaymentMode_E wallet | -0.01 | 0.06 | 0.02 | 0.04 | 0.05 | -0.02 | -0.02 | 0.01 | -0.02 | 0.01 | ... | -0.17 | 0.00 | 0.00 | 0.00 | -0.06 | 0.06 | 0.01 | 0.02 | 0.00 | -0.02 |
| PreferredPaymentMode_UPI | 0.01 | 0.00 | -0.04 | 0.00 | 0.00 | 0.04 | 0.00 | 0.02 | -0.01 | -0.00 | ... | 0.02 | 0.03 | 0.01 | -0.01 | -0.03 | 0.03 | 0.01 | -0.01 | 0.04 | -0.03 |
| HourSpendOnApp_0.0 | -0.19 | 0.03 | -0.10 | -0.00 | -0.01 | 0.02 | -0.11 | -0.01 | 0.02 | -0.08 | ... | 0.25 | -0.05 | 0.00 | 0.00 | 0.01 | -0.00 | -0.01 | -0.01 | 0.01 | -0.01 |
| HourSpendOnApp_1.0 | -0.06 | -0.04 | 0.03 | -0.03 | -0.01 | -0.02 | -0.03 | -0.01 | 0.02 | -0.02 | ... | -0.03 | -0.02 | -0.00 | 0.00 | 0.00 | -0.00 | 0.00 | -0.00 | -0.01 | 0.01 |
| HourSpendOnApp_2.0 | -0.49 | -0.02 | -0.00 | -0.01 | -0.09 | 0.02 | -0.11 | -0.00 | -0.04 | -0.15 | ... | -0.10 | 0.03 | 0.01 | 0.01 | 0.00 | 0.00 | -0.03 | -0.01 | -0.03 | 0.04 |
| HourSpendOnApp_3.0 | 0.16 | 0.01 | 0.01 | 0.01 | 0.02 | -0.00 | 0.05 | 0.01 | -0.01 | 0.04 | ... | -0.03 | 0.01 | -0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | -0.00 | -0.00 |
| HourSpendOnApp_4.0 | 0.45 | -0.00 | 0.03 | 0.01 | 0.08 | -0.02 | 0.11 | 0.00 | 0.04 | 0.16 | ... | 0.02 | -0.02 | -0.01 | -0.01 | -0.01 | -0.00 | 0.03 | 0.01 | 0.03 | -0.04 |
| HourSpendOnApp_5.0 | 0.02 | -0.01 | 0.01 | 0.01 | 0.01 | -0.01 | 0.01 | -0.01 | 0.01 | 0.03 | ... | -0.00 | -0.01 | 0.01 | -0.01 | -0.02 | 0.01 | 0.01 | -0.01 | -0.01 | 0.02 |
| NumberOfDeviceRegistered_1 | -0.15 | -0.04 | -0.01 | 0.01 | -0.03 | 0.02 | -0.05 | -0.00 | 0.01 | -0.06 | ... | 0.02 | -0.02 | 0.00 | -0.02 | 0.01 | -0.01 | 0.02 | -0.03 | 0.05 | -0.03 |
| NumberOfDeviceRegistered_2 | 0.14 | -0.05 | 0.02 | 0.01 | 0.03 | 0.02 | 0.03 | -0.00 | 0.03 | 0.03 | ... | 0.01 | -0.02 | -0.01 | 0.00 | -0.00 | -0.01 | 0.02 | 0.00 | 0.03 | -0.03 |
| NumberOfDeviceRegistered_3 | -0.55 | -0.03 | -0.03 | -0.00 | -0.07 | 0.00 | -0.10 | 0.00 | -0.03 | -0.18 | ... | -0.01 | 0.01 | 0.01 | 0.00 | -0.00 | -0.00 | -0.01 | 0.02 | -0.01 | -0.01 |
| NumberOfDeviceRegistered_4 | 0.29 | -0.01 | 0.04 | 0.00 | 0.03 | -0.01 | 0.07 | -0.00 | -0.02 | 0.09 | ... | -0.00 | 0.00 | -0.00 | -0.00 | -0.00 | 0.01 | -0.01 | 0.00 | -0.01 | 0.01 |
| NumberOfDeviceRegistered_5 | 0.23 | 0.06 | -0.01 | -0.00 | 0.03 | -0.01 | 0.04 | 0.00 | 0.02 | 0.09 | ... | 0.00 | 0.00 | -0.01 | 0.01 | -0.00 | 0.01 | -0.00 | -0.01 | -0.01 | 0.02 |
| NumberOfDeviceRegistered_6 | 0.15 | 0.08 | -0.03 | -0.03 | 0.03 | 0.01 | 0.02 | 0.01 | 0.04 | 0.06 | ... | -0.00 | 0.01 | 0.02 | 0.00 | 0.01 | -0.03 | -0.01 | -0.02 | 0.01 | 0.01 |
| PreferedOrderCat_Fashion | -0.01 | -0.01 | 0.12 | 0.02 | 0.06 | -0.03 | 0.04 | 0.01 | 0.07 | 0.14 | ... | -0.32 | -0.09 | -0.00 | 0.00 | -0.01 | -0.01 | 0.02 | 0.00 | 0.03 | -0.03 |
| PreferedOrderCat_Grocery | 0.00 | -0.09 | 0.35 | 0.01 | 0.05 | -0.05 | 0.07 | 0.01 | -0.13 | 0.01 | ... | -0.21 | -0.06 | -0.00 | 0.02 | -0.02 | 0.01 | 0.00 | 0.04 | 0.01 | -0.04 |
| PreferedOrderCat_Laptop & Accessory | 0.01 | -0.13 | -0.02 | -0.05 | 0.09 | -0.02 | 0.04 | -0.02 | 0.11 | -0.01 | ... | -0.58 | -0.17 | 0.02 | -0.00 | 0.01 | -0.03 | -0.00 | -0.01 | 0.03 | -0.03 |
| PreferedOrderCat_Mobile Phone | -0.00 | 0.22 | -0.37 | 0.01 | -0.14 | 0.05 | -0.14 | 0.01 | 0.13 | -0.12 | ... | 1.00 | -0.17 | -0.01 | -0.00 | -0.00 | 0.02 | -0.00 | -0.02 | -0.07 | 0.09 |
| PreferedOrderCat_Others | -0.01 | -0.05 | 0.27 | 0.05 | -0.05 | 0.03 | 0.05 | -0.01 | -0.49 | 0.05 | ... | -0.17 | 1.00 | -0.01 | -0.02 | 0.03 | 0.02 | -0.03 | 0.02 | 0.03 | -0.04 |
| SatisfactionScore_1 | 0.14 | -0.07 | 0.01 | -0.02 | -0.00 | 0.03 | -0.04 | 0.04 | 0.03 | -0.01 | ... | -0.01 | -0.01 | 1.00 | -0.17 | -0.34 | -0.25 | -0.25 | -0.21 | 0.14 | 0.02 |
| SatisfactionScore_2 | -0.20 | -0.04 | -0.00 | 0.01 | -0.02 | 0.01 | -0.01 | 0.00 | 0.01 | 0.01 | ... | -0.00 | -0.02 | -0.17 | 1.00 | -0.22 | -0.17 | -0.17 | 0.27 | -0.22 | 0.02 |
| SatisfactionScore_3 | -0.00 | 0.01 | 0.00 | -0.04 | 0.01 | -0.01 | -0.01 | -0.01 | -0.03 | -0.02 | ... | -0.00 | 0.03 | -0.34 | -0.22 | 1.00 | -0.32 | -0.33 | 0.02 | 0.01 | -0.02 |
| SatisfactionScore_4 | 0.01 | 0.00 | 0.01 | 0.01 | 0.00 | -0.03 | 0.03 | -0.04 | -0.01 | 0.01 | ... | 0.02 | 0.02 | -0.25 | -0.17 | -0.32 | 1.00 | -0.24 | -0.01 | 0.01 | 0.00 |
| SatisfactionScore_5 | 0.01 | 0.09 | -0.03 | 0.04 | 0.00 | -0.01 | 0.03 | 0.01 | 0.01 | 0.02 | ... | -0.00 | -0.03 | -0.25 | -0.17 | -0.33 | -0.24 | 1.00 | -0.00 | 0.01 | -0.01 |
| MaritalStatus_Divorced | -0.24 | -0.02 | 0.04 | 0.03 | -0.00 | -0.01 | -0.00 | 0.01 | -0.03 | 0.03 | ... | -0.02 | 0.02 | -0.21 | 0.27 | 0.02 | -0.01 | -0.00 | 1.00 | -0.45 | -0.29 |
| MaritalStatus_Married | 0.18 | -0.15 | 0.08 | -0.00 | 0.04 | 0.04 | 0.01 | -0.00 | -0.01 | 0.01 | ... | -0.07 | 0.03 | 0.14 | -0.22 | 0.01 | 0.01 | 0.01 | -0.45 | 1.00 | -0.73 |
| MaritalStatus_Single | -0.01 | 0.18 | -0.12 | -0.02 | -0.04 | -0.03 | -0.01 | -0.00 | 0.04 | -0.03 | ... | 0.09 | -0.04 | 0.02 | 0.02 | -0.02 | 0.00 | -0.01 | -0.29 | -0.73 | 1.00 |
47 rows × 47 columns
import plotly.graph_objs as go
import plotly.offline as py

# Interactive heatmap of the correlation matrix computed above
heatmap = go.Heatmap(
    z=corr_array,
    x=matrix_cols,
    y=matrix_cols,
    colorscale="Viridis",
    colorbar=dict(title="Pearson Correlation coefficient",
                  titleside="right"),
)
layout = go.Layout(
    dict(
        title="Correlation Matrix for variables",
        autosize=False,
        height=720,
        width=800,
        # wide left/bottom margins leave room for the long column labels
        margin=dict(r=0, l=210, t=25, b=210),
        yaxis=dict(tickfont=dict(size=9)),
        xaxis=dict(tickfont=dict(size=9)),
    )
)
fig = go.Figure(data=[heatmap], layout=layout)
py.iplot(fig)
#hold out 20% of the rows for testing (fixed seed for reproducibility)
train, test = train_test_split(ecd_clean, test_size=0.20, random_state=0)
#model features: every column except the identifier and the target
cols = [c for c in ecd_clean.columns if c not in cust_id + Target]
X_train, Y_train = train[cols], train[Target]
X_test, Y_test = test[cols], test[Target]
#logistic Regression
# Create an instance of the LogisticRegression model
logistic_regression_model = LogisticRegression(random_state=0)
# Fit the model to the training data.
# Y_train is a single-column DataFrame; .values.ravel() flattens it to the
# 1-D array sklearn expects (avoids a DataConversionWarning).
logistic_regression_model.fit(X_train, Y_train.values.ravel())
# Make predictions on the test data
Y_pred = logistic_regression_model.predict(X_test)
# Evaluate the model's performance
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
# Print the evaluation metrics
print("Accuracy:", accuracy)
print("")
print("Precision:", precision)
print("")
print("Recall:", recall)
print("")
print("F1 Score:", f1)
print("")
# Generate a confusion matrix (rows = actual class, columns = predicted)
confusion_matrix_result = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(confusion_matrix_result)
print("")
df_cm = pd.DataFrame(confusion_matrix_result, index = (0, 1), columns = (0, 1))
plt.figure(figsize = (10, 7))
sns.heatmap(df_cm, annot = True, fmt ='g', cmap='hot')
plt.show()
print("Test Data Accuracy: %.6f" %accuracy_score(Y_test, Y_pred))
Accuracy: 0.8809946714031972 Precision: 0.712 Recall: 0.47593582887700536 F1 Score: 0.5705128205128205 Confusion Matrix: [[903 36] [ 98 89]] Test Data Accuracy: 0.880995
# Per-class precision / recall / F1 breakdown for the logistic model
classification_report_result = classification_report(Y_test, Y_pred)
print("Classification Report for Logistic Regression:",
      classification_report_result, sep="\n")
Classification Report for Logistic Regression:
precision recall f1-score support
0 0.90 0.96 0.93 939
1 0.71 0.48 0.57 187
accuracy 0.88 1126
macro avg 0.81 0.72 0.75 1126
weighted avg 0.87 0.88 0.87 1126
Decision Tree Classifier and Random Forest Classifier
#Building our model using Decision Tree Classifier.
# Drop the identifier as well as the target: CustomerID carries no
# predictive signal (the logistic-regression split above also excluded it
# via cust_id, so this keeps the feature sets consistent).
X = ecd_clean.drop(['Churn', 'CustomerID'], axis=1)
y = ecd_clean['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=128)
# Fixed random_state so feature tie-breaking is reproducible across runs
deci_tree = DecisionTreeClassifier(random_state=128)
deci_tree.fit(X_train, y_train)
deci_tree_pred = deci_tree.predict(X_test)
# Calculate and print accuracy, precision and recall
accuracy_dt = accuracy_score(y_test, deci_tree_pred)
precision_dt = precision_score(y_test, deci_tree_pred)
recall_dt = recall_score(y_test, deci_tree_pred)
print("Accuracy:", accuracy_dt)
print("Precision:", precision_dt)
print("Recall:", recall_dt)
#confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, deci_tree_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
Accuracy: 0.9316163410301954 Precision: 0.7723214285714286 Recall: 0.8693467336683417
#per-class precision / recall / F1 summary for the decision tree
class_report = classification_report(y_test, deci_tree_pred)
# sep="\n " reproduces print()'s two-argument spacing byte-for-byte
print("Classification Report for Decision Tree:", class_report, sep="\n ")
Classification Report for Decision Tree:
precision recall f1-score support
0 0.97 0.94 0.96 927
1 0.77 0.87 0.82 199
accuracy 0.93 1126
macro avg 0.87 0.91 0.89 1126
weighted avg 0.94 0.93 0.93 1126
#generating feature importance from the fitted decision tree
feature_importance1 = deci_tree.feature_importances_
feature_names2 = X.columns
# Sort ascending so the most important feature lands at the top of the
# horizontal bar chart, making the ranking readable at a glance
order = np.argsort(feature_importance1)
# Create a bar plot to visualize feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_names2[order], feature_importance1[order])
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Decision Tree Feature Importance')
plt.show()
#building our model using Random Forest Classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=324)
# 120 trees; fixed random_state makes the ensemble (and the metrics printed
# below) reproducible across runs
rand_fcl = RandomForestClassifier(n_estimators=120, random_state=324)
rand_fcl.fit(X_train, y_train)
rand_fcl_pred = rand_fcl.predict(X_test)
# Evaluate the model's performance
accuracy_rfc = accuracy_score(y_test, rand_fcl_pred)
precision_rfc = precision_score(y_test, rand_fcl_pred)
recall_rfc = recall_score(y_test, rand_fcl_pred)
print("Accuracy:", accuracy_rfc)
print("Precision:", precision_rfc)
print("Recall:", recall_rfc)
# Create a confusion matrix (rows = actual class, columns = predicted)
conf_matrix = confusion_matrix(y_test, rand_fcl_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="gray", fmt="d")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix Random Forest Classifier")
plt.show()
Accuracy: 0.9609236234458259 Precision: 0.9743589743589743 Recall: 0.7916666666666666
# Visualise which features drive the random forest's predictions
feature_importance2 = rand_fcl.feature_importances_
feature_names2 = X.columns
# Horizontal bar chart via the object-oriented matplotlib API
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_names2, feature_importance2)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title('Random Forest Feature Importance')
plt.show()